---
title: "NLP Analysis on the Customer Reviews of Top 5 Products"
output:
flexdashboard::flex_dashboard:
orientation: columns
source_code: embed
theme: sandstone
---
```{r setup, include=FALSE}
library(flexdashboard)
library(ggplot2)
library(plotly)
library(rJava)
library(dplyr)
library(tidytext)
library(tm)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(RWeka)
library(qdap)
library(readr)
library(stringr)
library(pdfsearch)
library(h2o)
library(Rtsne)
library(wordcloud2)
library(ggthemes)
library(highcharter)
library(topicmodels)
library(plotly)
library(ivmte)
```
```{r}
# Build a cleaned text corpus from the reviews of the given product.
#
# Reads the reviews CSV, drops rows without review text, keeps the rows whose
# clothing ID matches `product_id`, and returns a tm VCorpus that has been
# lowercased and stripped of stopwords, punctuation and digits.
#
# Args:
#   product_id: clothing ID (or vector of IDs) whose reviews are selected.
#   path: location of the reviews CSV. Defaults to the path this dashboard
#     was originally written against; the file is expected to have the 11
#     columns named below, in that order.
# Returns:
#   A tm VCorpus with one document per selected review.
review_reader <- function(
    product_id,
    path = "C:/Users/somang.han/Downloads/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv") {
  doc <- read_csv(path, col_names = TRUE)
  colnames(doc) <- c(
    "num_comments", "clothing_id", "age", "title", "review_text", "rating",
    "recommended_ind", "positive_feedback_count", "division_name",
    "department_name", "class_name"
  )

  # Drop reviews with no text, then keep only the requested product.
  # `%in%` never yields NA, so rows with a missing clothing ID are excluded
  # instead of coming back as all-NA rows (which `==` would produce).
  doc <- doc[!is.na(doc$review_text), ]
  small_doc <- doc[doc$clothing_id %in% product_id, ]

  corpus <- VCorpus(VectorSource(small_doc$review_text))

  # Insert a space between a digit and a directly following letter
  # (e.g. "5ft" -> "5 ft") so the digit can be removed on its own.
  sepNumbers <- function(x) {
    gsub("([0-9])([a-zA-Z])", "\\1 \\2", x)
  }

  # Extra stopwords removed in addition to tm's default English list.
  extra_stopwords <- c(
    "s", "ve", "I", "I'm", "'m", "'", "you", "your", "were", "etc", "such",
    "either", "yes", "dont", "however", "also", "e", "d", "she", "didn", "he",
    "could", "couldn", "which", "will", "had", "did", "when", "doesn't",
    "does", "because", "your", "you", "u", "me", "it's", "its", "ll", "still",
    "nor", "am", "ax", "i", "you", "edu", "s", "t", "m", "subject", "can",
    "lines", "re", "what", "there", "all", "we", "one", "the", "this", "still",
    "org", "of", "or", "in", "for", "by", "on", "'ll", "but", "is", "in", "a",
    "an", "with", "as", "was", "if", "they", "are", "this", "and", "it",
    "from", "at", "my", "be", "not", "that", "to", "com", "org", "don't", "so",
    "has", "hasn't", "haven't", "have not"
  )

  corpus.ng <- corpus %>%
    # removeWords() matches case-sensitively and the stopword list is
    # lowercase, so lowercase first; otherwise capitalized stopwords such as
    # "The" or "This" survive the cleaning step.
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, c(stopwords(), extra_stopwords)) %>%
    tm_map(removePunctuation) %>%
    tm_map(content_transformer(sepNumbers)) %>%
    tm_map(removeNumbers)

  corpus.ng
}
# Load the raw (uncleaned) review rows for the given product.
#
# Args:
#   product_id: clothing ID (or vector of IDs) whose reviews are selected.
#   path: location of the reviews CSV. Defaults to the path this dashboard
#     was originally written against; the file is expected to have the 11
#     columns named below, in that order.
# Returns:
#   The rows of the reviews table that have review text and whose
#   `clothing_id` matches `product_id`, with the renamed columns.
original_review <- function(
    product_id,
    path = "C:/Users/somang.han/Downloads/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv") {
  doc <- read_csv(path, col_names = TRUE)
  colnames(doc) <- c(
    "num_comments", "clothing_id", "age", "title", "review_text", "rating",
    "recommended_ind", "positive_feedback_count", "division_name",
    "department_name", "class_name"
  )

  # Drop reviews with no text.
  doc <- doc[!is.na(doc$review_text), ]

  # `%in%` never yields NA, so rows with a missing clothing ID are excluded
  # instead of coming back as all-NA rows (which `==` would produce).
  doc[doc$clothing_id %in% product_id, ]
}
# Bar chart of the 15 most frequent two-word sequences in a corpus.
#
# Args:
#   clean_document: a tm corpus, e.g. the result of review_reader().
# Returns:
#   A highcharter bar chart of the top 15 bigrams by total count.
bi_gram_out <- function(clean_document) {
  # RWeka tokenizer restricted to exactly two consecutive words.
  tokenize_pairs <- function(x) {
    NGramTokenizer(x, Weka_control(min = 2, max = 2))
  }

  pair_tdm <- TermDocumentMatrix(
    clean_document,
    control = list(tokenize = tokenize_pairs)
  )

  # Total occurrences of each bigram across all documents, most frequent first.
  pair_counts <- rowSums(as.matrix(pair_tdm))
  pair_counts <- sort(pair_counts, decreasing = TRUE)
  top_pairs <- head(
    data.frame(word = names(pair_counts), freq = pair_counts),
    15
  )

  chart <- hchart(top_pairs, type = "bar", hcaes(x = word, y = freq))
  chart <- hc_title(chart, text = "Top 15 Frequent Pair Words")
  chart <- hc_colors(chart, "orange")
  hc_add_theme(chart, hc_theme_flat())
}
# Bar chart of the 15 most frequent three-word sequences in a corpus.
#
# Args:
#   clean_document: a tm corpus, e.g. the result of review_reader().
# Returns:
#   A highcharter bar chart of the top 15 trigrams by total count.
tri_gram_out <- function(clean_document) {
  # RWeka tokenizer restricted to exactly three consecutive words.
  tokenize_triples <- function(x) {
    NGramTokenizer(x, Weka_control(min = 3, max = 3))
  }

  triple_tdm <- TermDocumentMatrix(
    clean_document,
    control = list(tokenize = tokenize_triples)
  )

  # Total occurrences of each trigram across all documents, most frequent first.
  triple_counts <- rowSums(as.matrix(triple_tdm))
  triple_counts <- sort(triple_counts, decreasing = TRUE)
  top_triples <- head(
    data.frame(word = names(triple_counts), freq = triple_counts),
    15
  )

  chart <- hchart(top_triples, type = "bar", hcaes(x = word, y = freq))
  chart <- hc_title(chart, text = "Top 15 Frequent Three Consecutive Words")
  chart <- hc_colors(chart, "green")
  hc_add_theme(chart, hc_theme_flat())
}
# Draw a word cloud of the terms in a corpus.
#
# Args:
#   data: a tm corpus, e.g. the result of review_reader().
# Draws the cloud on the current graphics device; only terms occurring at
# least 20 times are eligible and at most 90 words are drawn.
word_cloud_fun <- function(data) {
  no_punct <- tm_map(data, removePunctuation)
  term_matrix <- as.matrix(TermDocumentMatrix(no_punct))

  # Total count of each term across all documents, most frequent first.
  term_counts <- sort(rowSums(term_matrix), decreasing = TRUE)
  cloud_df <- data.frame(word = names(term_counts), freq = term_counts)

  # Alternative renderer that was tried and left disabled:
  # wordcloud2(cloud_df, size = 1.5, color = "random-light",
  #            backgroundColor = "black")
  wordcloud(
    words = cloud_df$word,
    freq = cloud_df$freq,
    min.freq = 20,
    max.words = 90,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
}
# Pie chart of the ten most common customer ages for a set of reviews.
#
# Args:
#   data: a data frame with an `age` column, e.g. the result of
#     original_review().
# Returns:
#   A highcharter pie chart. Each slice is one age; its value is that age's
#   share (in percent, one decimal) of ALL rows in `data`, not only of the
#   ten ages shown.
age_graph <- function(data) {
  age_counts <- as.data.frame(table(data$age))
  colnames(age_counts) <- c("age", "value")

  # Vectorized share computation. The previous row-wise apply() coerced the
  # data frame to a character matrix and recomputed the total for every row.
  top_ages <- age_counts %>%
    mutate(percent = round(value / sum(value) * 100, 1)) %>%
    arrange(desc(percent)) %>%
    head(10)

  highchart() %>%
    hc_add_series(top_ages, hcaes(x = age, y = percent), type = "pie") %>%
    hc_add_theme(hc_theme_flatdark()) %>%
    hc_title(text = "Percent of Age of Customer")
}
```
```{r}
#Womens_Clothing_E_Commerce_Reviews <- read_csv("C:/Users/somang.han/Downloads/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv")
#colnames(Womens_Clothing_E_Commerce_Reviews) <- c("num_comments","clothing_id","age","title","review_text","rating","recommended_ind","positive_feedback_count","division_name","department_name","class_name")
#head(Womens_Clothing_E_Commerce_Reviews)
# sort(table(Womens_Clothing_E_Commerce_Reviews$clothing_id),decreasing = T)[1:5]
#top_5_review_product_id=names(sort(table(Womens_Clothing_E_Commerce_Reviews$clothing_id),decreasing = T)[1:5])
#top_5_review_product_info=Womens_Clothing_E_Commerce_Reviews[Womens_Clothing_E_Commerce_Reviews$clothing_id %in% #top_5_review_product_id,]
```
Clothing ID 862 Product Review Summary
=======================================================================
Column {data-width=600}
-----------------------------------------------------------------------
### Wordcloud of the Review
```{r}
# Word cloud of the cleaned review corpus for clothing ID 862.
review_reader(862) %>% word_cloud_fun()
```
### Age Distribution of Customers
```{r}
# Age breakdown of the customers who reviewed clothing ID 862.
original_review(862) %>% age_graph()
```
Column {data-height=1000} {.tabset}
-----------------------------------------------------------------------
### Bigram
```{r fig.height=5}
# Top bigrams in the cleaned reviews for clothing ID 862.
review_reader(862) %>% bi_gram_out()
```
### Trigram
```{r fig.height=5}
# Top trigrams in the cleaned reviews for clothing ID 862.
review_reader(862) %>% tri_gram_out()
```
Clothing ID 1078 Product Review Summary
=======================================================================
Column {data-width=600}
-----------------------------------------------------------------------
### Wordcloud of the Review
```{r}
# Word cloud of the cleaned review corpus for clothing ID 1078.
review_reader(1078) %>% word_cloud_fun()
```
### Age Distribution of Customers
```{r}
# Age breakdown of the customers who reviewed clothing ID 1078.
original_review(1078) %>% age_graph()
```
Column {data-height=1000} {.tabset}
-----------------------------------------------------------------------
### Bigram
```{r fig.height=5}
# Top bigrams in the cleaned reviews for clothing ID 1078.
review_reader(1078) %>% bi_gram_out()
```
### Trigram
```{r fig.height=5}
# Top trigrams in the cleaned reviews for clothing ID 1078.
review_reader(1078) %>% tri_gram_out()
```
Clothing ID 1094 Product Review Summary
=======================================================================
Column {data-width=600}
-----------------------------------------------------------------------
### Wordcloud of the Review
```{r}
# Word cloud of the cleaned review corpus for clothing ID 1094.
review_reader(1094) %>% word_cloud_fun()
```
### Age Distribution of Customers
```{r}
# Age breakdown of the customers who reviewed clothing ID 1094.
original_review(1094) %>% age_graph()
```
Column {data-height=1000} {.tabset}
-----------------------------------------------------------------------
### Bigram
```{r fig.height=5}
# Top bigrams in the cleaned reviews for clothing ID 1094.
review_reader(1094) %>% bi_gram_out()
```
### Trigram
```{r fig.height=5}
# Top trigrams in the cleaned reviews for clothing ID 1094.
review_reader(1094) %>% tri_gram_out()
```
Clothing ID 1081 Product Review Summary
=======================================================================
Column {data-width=600}
-----------------------------------------------------------------------
### Wordcloud of the Review
```{r}
# Word cloud of the cleaned review corpus for clothing ID 1081.
review_reader(1081) %>% word_cloud_fun()
```
### Age Distribution of Customers
```{r}
# Age breakdown of the customers who reviewed clothing ID 1081.
original_review(1081) %>% age_graph()
```
Column {data-height=1000} {.tabset}
-----------------------------------------------------------------------
### Bigram
```{r fig.height=5}
# Top bigrams in the cleaned reviews for clothing ID 1081.
review_reader(1081) %>% bi_gram_out()
```
### Trigram
```{r fig.height=5}
# Top trigrams in the cleaned reviews for clothing ID 1081.
review_reader(1081) %>% tri_gram_out()
```
Clothing ID 872 Product Review Summary
=======================================================================
Column {data-width=600}
-----------------------------------------------------------------------
### Wordcloud of the Review
```{r}
# Word cloud of the cleaned review corpus for clothing ID 872.
review_reader(872) %>% word_cloud_fun()
```
### Age Distribution of Customers
```{r}
# Age breakdown of the customers who reviewed clothing ID 872.
original_review(872) %>% age_graph()
```
Column {data-height=1000} {.tabset}
-----------------------------------------------------------------------
### Bigram
```{r fig.height=5}
# Top bigrams in the cleaned reviews for clothing ID 872.
review_reader(872) %>% bi_gram_out()
```
### Trigram
```{r fig.height=5}
# Top trigrams in the cleaned reviews for clothing ID 872.
review_reader(872) %>% tri_gram_out()
```